{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.model_selection import cross_val_score\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 223,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"labeleddataformodel.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data[['title', 'isupliftinganecdote']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 238,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3603\n",
      "0.767142585189\n",
      "[ 0.78640777  0.72815534  0.75        0.81862745  0.74509804  0.78431373\n",
      "  0.76470588  0.7745098   0.75490196  0.76470588]\n"
     ]
    }
   ],
   "source": [
    "#without stopword removal maximum score on 3603 max features\n",
    "#with stopword removal max score on range around 3300, with other peak around 3812\n",
    "#worse with stopword removal\n",
    "\n",
    "for i in range(3603, 3604):\n",
    "    countvec = CountVectorizer(max_features = i)\n",
    "    #countvec = CountVectorizer(stop_words = \"english\", max_features = i)\n",
    "    vectors = countvec.fit_transform(data['title'])\n",
    "    labels = data['isupliftinganecdote']\n",
    "    mnb = MultinomialNB()\n",
    "    mnb.fit(vectors, labels)\n",
    "    print(i)\n",
    "    scores = cross_val_score(mnb, vectors, labels, cv=10)\n",
    "    print(sum(scores) / len(scores))\n",
    "    print(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 226,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2, 3603)"
      ]
     },
     "execution_count": 226,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "title_tokens = countvec.get_feature_names()\n",
    "\n",
    "\n",
    "mnb.feature_count_\n",
    "mnb.feature_count_.shape\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 227,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 14.,   1.,   2., ...,   1.,   1.,   1.])"
      ]
     },
     "execution_count": 227,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# number of times each token appears across all nonua titles\n",
    "nonua_token_count = mnb.feature_count_[0, :]\n",
    "nonua_token_count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 228,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 17.,   1.,   0., ...,   0.,   0.,   0.])"
      ]
     },
     "execution_count": 228,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# number of times each token appears across all ua titles\n",
    "ua_token_count = mnb.feature_count_[1, :]\n",
    "ua_token_count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 229,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nonua</th>\n",
       "      <th>ua</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>token</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>000</th>\n",
       "      <td>14.0</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>03</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>08</th>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>19.0</td>\n",
       "      <td>7.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100</th>\n",
       "      <td>6.0</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       nonua    ua\n",
       "token             \n",
       "000     14.0  17.0\n",
       "03       1.0   1.0\n",
       "08       2.0   0.0\n",
       "10      19.0   7.0\n",
       "100      6.0   6.0"
      ]
     },
     "execution_count": 229,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a DataFrame of tokens with their separate nonua and ua counts\n",
    "tokens = pd.DataFrame({'token':title_tokens, 'nonua':nonua_token_count, 'ua':ua_token_count}).set_index('token')\n",
    "tokens.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 230,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 1022.,  1022.])"
      ]
     },
     "execution_count": 230,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mnb.class_count_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 231,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nonua</th>\n",
       "      <th>ua</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>token</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>dc</th>\n",
       "      <td>3.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>stocking</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>circus</th>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>silverback</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pockets</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            nonua   ua\n",
       "token                 \n",
       "dc            3.0  2.0\n",
       "stocking      2.0  1.0\n",
       "circus        1.0  3.0\n",
       "silverback    2.0  1.0\n",
       "pockets       2.0  1.0"
      ]
     },
     "execution_count": 231,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# add 1 to ua and nonua counts to avoid dividing by 0\n",
    "tokens['nonua'] = tokens.nonua + 1\n",
    "tokens['ua'] = tokens.ua + 1\n",
    "tokens.sample(5, random_state=6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 232,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nonua</th>\n",
       "      <th>ua</th>\n",
       "      <th>ua_ratio</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>token</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>dc</th>\n",
       "      <td>3.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.666667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>stocking</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>circus</th>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>silverback</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pockets</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.500000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            nonua   ua  ua_ratio\n",
       "token                           \n",
       "dc            3.0  2.0  0.666667\n",
       "stocking      2.0  1.0  0.500000\n",
       "circus        1.0  3.0  3.000000\n",
       "silverback    2.0  1.0  0.500000\n",
       "pockets       2.0  1.0  0.500000"
      ]
     },
     "execution_count": 232,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate the ratio of ua-to-nonua for each token\n",
    "tokens['ua_ratio'] = tokens.ua / tokens.nonua\n",
    "tokens.sample(5, random_state=6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 233,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nonua</th>\n",
       "      <th>ua</th>\n",
       "      <th>ua_ratio</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>token</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>puppy</th>\n",
       "      <td>1.0</td>\n",
       "      <td>21.0</td>\n",
       "      <td>21.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>homeless</th>\n",
       "      <td>1.0</td>\n",
       "      <td>21.0</td>\n",
       "      <td>21.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>saves</th>\n",
       "      <td>1.0</td>\n",
       "      <td>16.0</td>\n",
       "      <td>16.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>child</th>\n",
       "      <td>1.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>15.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>kitten</th>\n",
       "      <td>1.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>15.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pit</th>\n",
       "      <td>1.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>15.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>lost</th>\n",
       "      <td>1.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hero</th>\n",
       "      <td>1.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>12.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bull</th>\n",
       "      <td>1.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>12.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dad</th>\n",
       "      <td>2.0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>11.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>little</th>\n",
       "      <td>2.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>10.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>proud</th>\n",
       "      <td>1.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>10.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sweet</th>\n",
       "      <td>1.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>10.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>adorable</th>\n",
       "      <td>2.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>10.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>touching</th>\n",
       "      <td>1.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>10.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>stray</th>\n",
       "      <td>1.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>10.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>note</th>\n",
       "      <td>1.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>9.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pet</th>\n",
       "      <td>1.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>9.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>adopted</th>\n",
       "      <td>1.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>9.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>happy</th>\n",
       "      <td>1.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>9.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>birthday</th>\n",
       "      <td>2.0</td>\n",
       "      <td>18.0</td>\n",
       "      <td>9.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>saved</th>\n",
       "      <td>2.0</td>\n",
       "      <td>17.0</td>\n",
       "      <td>8.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>newborn</th>\n",
       "      <td>1.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>8.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>husky</th>\n",
       "      <td>1.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>8.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>blind</th>\n",
       "      <td>1.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>8.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>moment</th>\n",
       "      <td>1.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>8.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dog</th>\n",
       "      <td>12.0</td>\n",
       "      <td>96.0</td>\n",
       "      <td>8.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>brings</th>\n",
       "      <td>1.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>8.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>brothers</th>\n",
       "      <td>1.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>8.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pictures</th>\n",
       "      <td>1.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>8.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>based</th>\n",
       "      <td>6.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>blocks</th>\n",
       "      <td>6.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tutorials</th>\n",
       "      <td>6.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>style</th>\n",
       "      <td>6.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>russia</th>\n",
       "      <td>6.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>court</th>\n",
       "      <td>6.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>feeling</th>\n",
       "      <td>6.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>marijuana</th>\n",
       "      <td>6.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sanders</th>\n",
       "      <td>6.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>legal</th>\n",
       "      <td>6.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>low</th>\n",
       "      <td>6.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tax</th>\n",
       "      <td>6.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>claims</th>\n",
       "      <td>6.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.166667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fbi</th>\n",
       "      <td>7.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.142857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hillary</th>\n",
       "      <td>7.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.142857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>china</th>\n",
       "      <td>7.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.142857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>quiz</th>\n",
       "      <td>7.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.142857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>list</th>\n",
       "      <td>7.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.142857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>aged</th>\n",
       "      <td>7.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.142857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>admits</th>\n",
       "      <td>7.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.142857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>administration</th>\n",
       "      <td>7.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.142857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hairstyles</th>\n",
       "      <td>7.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.142857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dead</th>\n",
       "      <td>15.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.133333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016</th>\n",
       "      <td>9.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.111111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>clinton</th>\n",
       "      <td>9.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.111111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>easy</th>\n",
       "      <td>10.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.100000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>president</th>\n",
       "      <td>12.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.083333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>trump</th>\n",
       "      <td>99.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>0.080808</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ideas</th>\n",
       "      <td>13.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.076923</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>donald</th>\n",
       "      <td>13.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.076923</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3603 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                nonua    ua   ua_ratio\n",
       "token                                 \n",
       "puppy             1.0  21.0  21.000000\n",
       "homeless          1.0  21.0  21.000000\n",
       "saves             1.0  16.0  16.000000\n",
       "child             1.0  15.0  15.000000\n",
       "kitten            1.0  15.0  15.000000\n",
       "pit               1.0  15.0  15.000000\n",
       "lost              1.0  13.0  13.000000\n",
       "hero              1.0  12.0  12.000000\n",
       "bull              1.0  12.0  12.000000\n",
       "dad               2.0  22.0  11.000000\n",
       "little            2.0  20.0  10.000000\n",
       "proud             1.0  10.0  10.000000\n",
       "sweet             1.0  10.0  10.000000\n",
       "adorable          2.0  20.0  10.000000\n",
       "touching          1.0  10.0  10.000000\n",
       "stray             1.0  10.0  10.000000\n",
       "note              1.0   9.0   9.000000\n",
       "pet               1.0   9.0   9.000000\n",
       "adopted           1.0   9.0   9.000000\n",
       "happy             1.0   9.0   9.000000\n",
       "birthday          2.0  18.0   9.000000\n",
       "saved             2.0  17.0   8.500000\n",
       "newborn           1.0   8.0   8.000000\n",
       "husky             1.0   8.0   8.000000\n",
       "blind             1.0   8.0   8.000000\n",
       "moment            1.0   8.0   8.000000\n",
       "dog              12.0  96.0   8.000000\n",
       "brings            1.0   8.0   8.000000\n",
       "brothers          1.0   8.0   8.000000\n",
       "pictures          1.0   8.0   8.000000\n",
       "...               ...   ...        ...\n",
       "based             6.0   1.0   0.166667\n",
       "blocks            6.0   1.0   0.166667\n",
       "tutorials         6.0   1.0   0.166667\n",
       "style             6.0   1.0   0.166667\n",
       "russia            6.0   1.0   0.166667\n",
       "court             6.0   1.0   0.166667\n",
       "feeling           6.0   1.0   0.166667\n",
       "marijuana         6.0   1.0   0.166667\n",
       "sanders           6.0   1.0   0.166667\n",
       "legal             6.0   1.0   0.166667\n",
       "low               6.0   1.0   0.166667\n",
       "tax               6.0   1.0   0.166667\n",
       "claims            6.0   1.0   0.166667\n",
       "fbi               7.0   1.0   0.142857\n",
       "hillary           7.0   1.0   0.142857\n",
       "china             7.0   1.0   0.142857\n",
       "quiz              7.0   1.0   0.142857\n",
       "list              7.0   1.0   0.142857\n",
       "aged              7.0   1.0   0.142857\n",
       "admits            7.0   1.0   0.142857\n",
       "administration    7.0   1.0   0.142857\n",
       "hairstyles        7.0   1.0   0.142857\n",
       "dead             15.0   2.0   0.133333\n",
       "2016              9.0   1.0   0.111111\n",
       "clinton           9.0   1.0   0.111111\n",
       "easy             10.0   1.0   0.100000\n",
       "president        12.0   1.0   0.083333\n",
       "trump            99.0   8.0   0.080808\n",
       "ideas            13.0   1.0   0.076923\n",
       "donald           13.0   1.0   0.076923\n",
       "\n",
       "[3603 rows x 3 columns]"
      ]
     },
     "execution_count": 233,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the DataFrame sorted by ua_ratio\n",
    "# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier\n",
    "tokens.sort_values('ua_ratio', ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 234,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          nonua    ua  ua_ratio\n",
      "token                          \n",
      "puppy       1.0  21.0      21.0\n",
      "homeless    1.0  21.0      21.0\n",
      "saves       1.0  16.0      16.0\n",
      "child       1.0  15.0      15.0\n",
      "kitten      1.0  15.0      15.0\n"
     ]
    }
   ],
   "source": [
    "dataframe = tokens.sort_values('ua_ratio', ascending=False)\n",
    "print(dataframe.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 235,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          nonua    ua  ua_ratio\n",
      "token                          \n",
      "puppy       1.0  21.0      21.0\n",
      "homeless    1.0  21.0      21.0\n",
      "saves       1.0  16.0      16.0\n",
      "child       1.0  15.0      15.0\n",
      "kitten      1.0  15.0      15.0\n",
      "pit         1.0  15.0      15.0\n",
      "lost        1.0  13.0      13.0\n",
      "hero        1.0  12.0      12.0\n",
      "bull        1.0  12.0      12.0\n",
      "dad         2.0  22.0      11.0\n",
      "little      2.0  20.0      10.0\n",
      "proud       1.0  10.0      10.0\n",
      "sweet       1.0  10.0      10.0\n",
      "adorable    2.0  20.0      10.0\n",
      "touching    1.0  10.0      10.0\n",
      "stray       1.0  10.0      10.0\n",
      "note        1.0   9.0       9.0\n",
      "pet         1.0   9.0       9.0\n",
      "adopted     1.0   9.0       9.0\n",
      "happy       1.0   9.0       9.0\n",
      "                nonua   ua  ua_ratio\n",
      "token                               \n",
      "low               6.0  1.0  0.166667\n",
      "tax               6.0  1.0  0.166667\n",
      "claims            6.0  1.0  0.166667\n",
      "fbi               7.0  1.0  0.142857\n",
      "hillary           7.0  1.0  0.142857\n",
      "china             7.0  1.0  0.142857\n",
      "quiz              7.0  1.0  0.142857\n",
      "list              7.0  1.0  0.142857\n",
      "aged              7.0  1.0  0.142857\n",
      "admits            7.0  1.0  0.142857\n",
      "administration    7.0  1.0  0.142857\n",
      "hairstyles        7.0  1.0  0.142857\n",
      "dead             15.0  2.0  0.133333\n",
      "2016              9.0  1.0  0.111111\n",
      "clinton           9.0  1.0  0.111111\n",
      "easy             10.0  1.0  0.100000\n",
      "president        12.0  1.0  0.083333\n",
      "trump            99.0  8.0  0.080808\n",
      "ideas            13.0  1.0  0.076923\n",
      "donald           13.0  1.0  0.076923\n"
     ]
    }
   ],
   "source": [
    "#print most and least probable to be ua words, top and bottom 20\n",
    "print(dataframe[0:20])\n",
    "\n",
    "print(dataframe[3583:3603])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 236,
   "metadata": {},
   "outputs": [],
   "source": [
    "#write data to csv\n",
    "docname = \"/Users/tessmcnulty/uawordprobabilities.csv\"\n",
    "dataframe.to_csv(docname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:root] *",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
